Code
library(tidyverse)
library(Polychrome)
library(withr)
library(fs)
library(compositions)
library(vegan)
library(ggVennDiagram)
library(qvalue)
source("data/input/utils_parallelism.R")Only ancestral, streptomycin and control treatments are included, unless otherwise stated.
Prior to this analysis, the variants have been filtered using Mutect2’s FilterMutectCalls as well as separately for SNPs and INDELs using the following hard filtering factors: SOR, FS, MQ, MQRankSum, ReadPosRankSum, SNP/INDEL clustering (3/35) and SNP/INDEL proximity.
library(tidyverse)
library(Polychrome)
library(withr)
library(fs)
library(compositions)
library(vegan)
library(ggVennDiagram)
library(qvalue)
source("data/input/utils_parallelism.R")Nonsynonymous variant allele frequencies of the significantly parallel genes (see the table in the section parallelism, inspecting individual genes (2.3)). Gene names used where available, otherwise prokka annotations or descriptions from eggnog are used in the title. Low abundance strains (from the 16S amplicon data) indicated with stars (* if the abundance < 0.05; ** if < 0.01). Heatmaps only include parallel genes, hence if a gene was found significantly parallel in the t0 but not in t3, the panel only contains variants of that gene for t0.
species_counts_md <- read.delim("data/input/species_counts_md.tsv")
counts <- species_counts_md %>% filter(day == 0 | day == 84) %>%
group_by(measure_env, evolution_env, replicate, day) %>%
mutate(read_sum = sum(count_correct)) %>%
ungroup() %>%
mutate(proportion = count_correct/read_sum)
par_genes <- read_tsv("data/results/parallel_genes_incl_t0.tsv")
variants_strep <- read_rds("data/input/variants_strep.rds")
variants_par <- inner_join(variants_strep, par_genes, by = join_by(Home, Measure, GENE == "locus_tag")) %>%
mutate(gene_label = ifelse(Preferred_name == "-" | is.na(Preferred_name), prokka_annotation, Preferred_name)) %>%
mutate(gene_label = ifelse(gene_label == "hypothetical protein", Description, gene_label)) %>%
mutate(gene_label = ifelse(gene_label == "-" | is.na(gene_label), GENE, gene_label)) %>%
select(HAMBI, GENE, CHROM, POS, ALT, VAF, home_env, measure_env, replicate, gene_label) %>%
group_by(GENE, home_env, measure_env) %>%
mutate(count_repl = n_distinct(replicate)) %>%
ungroup() %>%
mutate(home_repl = paste(home_env, replicate, sep = "___"))
variants_par_t3 <- variants_par %>%
filter(measure_env != "t0")
variants_par_t0 <- variants_par %>%
filter(measure_env == "t0",
home_repl %in% variants_par_t3$home_repl)
genes_par <- bind_rows(variants_par_t3, variants_par_t0) %>%
mutate(var = paste0(GENE, "_", POS, "_", ALT, " in replicate ", replicate),
timepoint = ifelse(measure_env == "t0", "t0", "t3"),
name = ifelse(home_env == "Home: Anc",
paste("ANC", measure_env, sep = " -> \n"),
ifelse(timepoint == "t0",
"t0",
paste(measure_env, sep = "_"))),
name = factor(name, levels = c("t0", "Meas: B", "Meas: BS", "ANC -> \nMeas: B", "ANC -> \nMeas: BS"))) %>%
mutate(measure_env2 = ifelse(measure_env == "t0", "none",
ifelse(measure_env == "Meas: BS", "bact_strep", "bact")),
home_env2 = ifelse(home_env == "Home: B", "bact",
ifelse(home_env == "Home: BS", "bact_strep",
"anc")))
counts <- counts %>%
mutate(day = ifelse(day == 0, "t0",
ifelse(day == 84, "t3", day)))
genes_par <- left_join(genes_par, select(counts, proportion, count_correct, day, strainID, replicate, evolution_env, measure_env),
by = join_by(home_env2 == evolution_env, measure_env2 == measure_env, replicate, HAMBI == strainID)) %>%
mutate(low_ab = ifelse(proportion >= 0.05, "",
ifelse(proportion <= 0.01, "**", "*")))Aims: (1) Do similar variants appear in Home: B and Home: ANC, especially in response to the streptomycin treatment (“Meas: BS” and “ANC -> Meas: BS”, on the x-axis). For example variants in HAMBI_1977 rsmG appear in both Home: B and Home: ANC when introduced to streptomycin. (2) What happens to the variants that were in parallel genes in t0. (3) Do new variants appear in t3.
genes_par %>%
filter(home_env == "Home: B" | home_env == "Home: Anc") %>%
ggplot(aes(y = var, x = name, fill = VAF)) +
geom_tile() +
geom_text(aes(y = var, x = name, label = low_ab), color = "white") +
facet_wrap(HAMBI ~ gene_label, scales = "free", ncol = 3, labeller = label_wrap_gen()) +
scale_fill_gradient(low = "darkblue", high = "hotpink") #theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0))Aims: (1) What happens to the variants in t0 parallel genes if the stress is removed (Meas: B) versus if the stress remains (Meas: BS). (2) Do new variants appear in the t3 parallel genes.
genes_par %>%
filter(home_env == "Home: BS") %>%
ggplot(aes(y = var, x = name, fill = VAF)) +
geom_tile() +
geom_text(aes(y = var, x = name, label = low_ab), color = "white") +
facet_wrap(HAMBI ~ gene_label, scales = "free", ncol = 3, labeller = label_wrap_gen()) +
scale_fill_gradient(low = "darkblue", high = "hotpink")How are parallel genes distributed across different home and measurement conditions?
venn_prep <- genes_par %>%
mutate(name = paste(home_env, measure_env, sep = " ; ")) %>%
select(GENE, name) %>%
distinct()
venn_prep <- split(venn_prep, venn_prep$name) %>%
map(\(x) unique(x$GENE))
heatmap_prep <- genes_par %>%
mutate(name = paste(home_env, measure_env, sep = " ; ")) %>%
select(GENE, name) %>%
distinct() %>%
mutate(value = 1) %>%
pivot_wider(names_from = "GENE", values_from = "value", values_fill = 0) %>%
column_to_rownames(var = "name")
x <- as.matrix(heatmap_prep)
p <- ggVennDiagram(venn_prep, order.set.by = c("name"))
venn_table <- p$plotlist[[2]]$data %>%
arrange(id) %>%
select(name, item) %>%
mutate(item = str_remove_all(item, "c\\(|\\)")) %>%
separate_longer_delim(item, ",") %>%
mutate(item = str_trim(item),
item = str_remove_all(item, "\""),
item = str_remove_all(item, "character\\(0")) %>%
left_join(distinct(select(variants_par, GENE, gene_label)), by = join_by(item == GENE)) %>%
mutate(gene_label = paste(item, gene_label)) %>%
select(-item) %>%
group_by(name) %>%
mutate(number = 1:n()) %>%
pivot_wider(names_from = name, values_from = gene_label) %>%
select(-number)heatmap(x,
distfun = function(x) vegan::vegdist(x, method="jaccard", binary=TRUE),
hclustfun = function(x) hclust(x, method="complete"),
scale="none", col = c("white", "black"))Instead of a Venn diagram, using UpSet plot to visualise how many parallel genes were in different home+measurement conditions. (Pretty good explanation on how these plots work can be found at https://upset.app/). The shared parallel genes are listed in the table below and should be in the same order as the bars / intersections.
prmarkdown::paged_table(venn_table)